{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Experiment with parameters for a Ridge Regression Model on the Diabetes Dataset in an Azure ML Pipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook is for experimenting with different parameters to train a ridge regression model on the Diabetes dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Change out of the experimentation directory\n",
    "%cd .."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import azureml.core\n",
    "from azureml.core import Workspace"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the workspace from the saved config file\n",
    "ws = Workspace.from_config()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os, shutil\n",
    "\n",
    "# Create a folder for the experiment files\n",
    "training_folder = 'diabetes-training'\n",
    "os.makedirs(training_folder, exist_ok=True)\n",
    "\n",
    "# Copy the data file into the experiment folder\n",
    "shutil.copy('data/diabetes.csv', os.path.join(training_folder, \"diabetes.csv\"))\n",
    "\n",
    "# Copy the train functions into the experiment folder\n",
    "shutil.copy('diabetes_regression/training/train.py', os.path.join(training_folder, \"train.py\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%writefile $training_folder/parameters.json\n",
    "{\n",
    "    \"training\":\n",
    "    {\n",
    "        \"alpha\": 0.3\n",
    "    },\n",
    "    \"evaluation\":\n",
    "    {\n",
    "\n",
    "    },\n",
    "    \"scoring\":\n",
    "    {\n",
    "        \n",
    "    }\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%writefile $training_folder/diabetes_training.py\n",
    "# Import libraries\n",
    "from azureml.core import Run\n",
    "import pandas as pd\n",
    "import shutil\n",
    "import joblib\n",
    "\n",
    "from train import split_data, train_model\n",
    "\n",
    "# Get parameters\n",
    "parser = argparse.ArgumentParser()\n",
    "parser.add_argument('--output_folder', type=str, dest='output_folder', default=\"diabetes_model\", help='output folder')\n",
    "args = parser.parse_args()\n",
    "output_folder = args.output_folder\n",
    "\n",
    "# Get the experiment run context\n",
    "run = Run.get_context()\n",
    "\n",
    "# load the diabetes dataset\n",
    "print(\"Loading Data...\")\n",
    "train_df = pd.read_csv('diabetes.csv')\n",
    "\n",
    "data = split_data(train_df)\n",
    "\n",
    "# Specify the parameters to test\n",
    "with open(\"parameters.json\") as f:\n",
    "    pars = json.load(f)\n",
    "    train_args = pars[\"training\"]\n",
    "\n",
    "# Log parameters\n",
    "for k, v in train_args.items():\n",
    "    run.log(k, v)\n",
    "\n",
    "model, metrics = train_model(data, train_args)\n",
    "\n",
    "# Log metrics\n",
    "for k, v in metrics.items():\n",
    "    run.log(k, v)\n",
    "\n",
    "# Save the parameters file to the outputs folder\n",
    "os.makedirs(output_folder, exist_ok=True)\n",
    "shutil.copy('parameters.json', os.path.join(output_folder, 'parameters.json'))\n",
    "joblib.dump(value=model, filename= output_folder + \"/model.pkl\")\n",
    "    \n",
    "run.complete()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%writefile $training_folder/register_diabetes.py\n",
    "# Import libraries\n",
    "import argparse\n",
    "import joblib\n",
    "from azureml.core import Workspace, Model, Run\n",
    "\n",
    "# Get parameters\n",
    "parser = argparse.ArgumentParser()\n",
    "parser.add_argument('--model_folder', type=str, dest='model_folder', default=\"diabetes_model\", help='model location')\n",
    "args = parser.parse_args()\n",
    "model_folder = args.model_folder\n",
    "\n",
    "# Get the experiment run context\n",
    "run = Run.get_context()\n",
    "\n",
    "# load the model\n",
    "print(\"Loading model from \" + model_folder)\n",
    "model_file = model_folder + \"/model.pkl\"\n",
    "model = joblib.load(model_file)\n",
    "\n",
    "Model.register(workspace=run.experiment.workspace,\n",
    "               model_path = model_file,\n",
    "               model_name = 'diabetes_model',\n",
    "               tags={'Training context':'Pipeline'})\n",
    "\n",
    "run.complete()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from azureml.core.compute import ComputeTarget, AmlCompute\n",
    "from azureml.core.compute_target import ComputeTargetException\n",
    "\n",
    "cluster_name = \"aml-cluster\"\n",
    "\n",
    "# Verify that cluster exists\n",
    "try:\n",
    "    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)\n",
    "    print('Found existing cluster, use it.')\n",
    "except ComputeTargetException:\n",
    "    # If not, create it\n",
    "    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',\n",
    "                                                           max_nodes=4,\n",
    "                                                           idle_seconds_before_scaledown=1800)\n",
    "    pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)\n",
    "\n",
    "pipeline_cluster.wait_for_completion(show_output=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from azureml.core import Environment\n",
    "from azureml.core.conda_dependencies import CondaDependencies\n",
    "from azureml.core.runconfig import RunConfiguration\n",
    "\n",
    "# Create a Python environment for the experiment\n",
    "diabetes_env = Environment(\"diabetes-pipeline-env\")\n",
    "diabetes_env.python.user_managed_dependencies = False # Let Azure ML manage dependencies\n",
    "diabetes_env.docker.enabled = True # Use a docker container\n",
    "\n",
    "# Create a set of package dependencies\n",
    "diabetes_packages = CondaDependencies.create(conda_packages=['scikit-learn','pandas'],\n",
    "                                             pip_packages=['azureml-sdk'])\n",
    "\n",
    "# Add the dependencies to the environment\n",
    "diabetes_env.python.conda_dependencies = diabetes_packages\n",
    "\n",
    "# Register the environment (just in case you want to use it again)\n",
    "diabetes_env.register(workspace=ws)\n",
    "registered_env = Environment.get(ws, 'diabetes-pipeline-env')\n",
    "\n",
    "# Create a new runconfig object for the pipeline\n",
    "pipeline_run_config = RunConfiguration()\n",
    "\n",
    "# Use the compute you created above. \n",
    "pipeline_run_config.target = pipeline_cluster\n",
    "\n",
    "# Assign the environment to the run configuration\n",
    "pipeline_run_config.environment = registered_env\n",
    "\n",
    "print (\"Run configuration created.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from azureml.pipeline.core import PipelineData\n",
    "from azureml.pipeline.steps import PythonScriptStep, EstimatorStep\n",
    "from azureml.train.estimator import Estimator\n",
    "\n",
    "# Get the training dataset\n",
    "#diabetes_ds = ws.datasets.get(\"diabetes dataset\")\n",
    "\n",
    "# Create a PipelineData (temporary Data Reference) for the model folder\n",
    "model_folder = PipelineData(\"model_folder\", datastore=ws.get_default_datastore())\n",
    "\n",
    "estimator = Estimator(source_directory=training_folder,\n",
    "                        compute_target = pipeline_cluster,\n",
    "                        environment_definition=pipeline_run_config.environment,\n",
    "                        entry_script='diabetes_training.py')\n",
    "\n",
    "# Step 1, run the estimator to train the model\n",
    "train_step = EstimatorStep(name = \"Train Model\",\n",
    "                           estimator=estimator, \n",
    "                           estimator_entry_script_arguments=['--output_folder', model_folder],\n",
    "                           outputs=[model_folder],\n",
    "                           compute_target = pipeline_cluster,\n",
    "                           allow_reuse = True)\n",
    "\n",
    "# Step 2, run the model registration script\n",
    "register_step = PythonScriptStep(name = \"Register Model\",\n",
    "                                source_directory = training_folder,\n",
    "                                script_name = \"register_diabetes.py\",\n",
    "                                arguments = ['--model_folder', model_folder],\n",
    "                                inputs=[model_folder],\n",
    "                                compute_target = pipeline_cluster,\n",
    "                                runconfig = pipeline_run_config,\n",
    "                                allow_reuse = True)\n",
    "\n",
    "print(\"Pipeline steps defined\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from azureml.core import Experiment\n",
    "from azureml.pipeline.core import Pipeline\n",
    "from azureml.widgets import RunDetails\n",
    "\n",
    "# Construct the pipeline\n",
    "pipeline_steps = [train_step, register_step]\n",
    "pipeline = Pipeline(workspace = ws, steps=pipeline_steps)\n",
    "print(\"Pipeline is built.\")\n",
    "\n",
    "# Create an experiment and run the pipeline\n",
    "experiment = Experiment(workspace = ws, name = 'diabetes-training-pipeline')\n",
    "pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)\n",
    "print(\"Pipeline submitted for execution.\")\n",
    "\n",
    "RunDetails(pipeline_run).show()\n",
    "pipeline_run.wait_for_completion()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from azureml.core import Model\n",
    "\n",
    "for model in Model.list(ws):\n",
    "    print(model.name, 'version:', model.version)\n",
    "    for tag_name in model.tags:\n",
    "        tag = model.tags[tag_name]\n",
    "        print ('\\t',tag_name, ':', tag)\n",
    "    for prop_name in model.properties:\n",
    "        prop = model.properties[prop_name]\n",
    "        print ('\\t',prop_name, ':', prop)\n",
    "    print('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
